In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
df = pd.read_csv("dataset.csv")
df.head()
Out[2]:
| VIN (1-10) | County | City | State | Postal Code | Model Year | Make | Model | Electric Vehicle Type | Clean Alternative Fuel Vehicle (CAFV) Eligibility | Electric Range | Base MSRP | Legislative District | DOL Vehicle ID | Vehicle Location | Electric Utility | 2020 Census Tract | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | JTMEB3FV6N | Monroe | Key West | FL | 33040 | 2022 | TOYOTA | RAV4 PRIME | Plug-in Hybrid Electric Vehicle (PHEV) | Clean Alternative Fuel Vehicle Eligible | 42 | 0 | NaN | 198968248 | POINT (-81.80023 24.5545) | NaN | 12087972100 |
| 1 | 1G1RD6E45D | Clark | Laughlin | NV | 89029 | 2013 | CHEVROLET | VOLT | Plug-in Hybrid Electric Vehicle (PHEV) | Clean Alternative Fuel Vehicle Eligible | 38 | 0 | NaN | 5204412 | POINT (-114.57245 35.16815) | NaN | 32003005702 |
| 2 | JN1AZ0CP8B | Yakima | Yakima | WA | 98901 | 2011 | NISSAN | LEAF | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 73 | 0 | 15.0 | 218972519 | POINT (-120.50721 46.60448) | PACIFICORP | 53077001602 |
| 3 | 1G1FW6S08H | Skagit | Concrete | WA | 98237 | 2017 | CHEVROLET | BOLT EV | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 238 | 0 | 39.0 | 186750406 | POINT (-121.7515 48.53892) | PUGET SOUND ENERGY INC | 53057951101 |
| 4 | 3FA6P0SU1K | Snohomish | Everett | WA | 98201 | 2019 | FORD | FUSION | Plug-in Hybrid Electric Vehicle (PHEV) | Not eligible due to low battery range | 26 | 0 | 38.0 | 2006714 | POINT (-122.20596 47.97659) | PUGET SOUND ENERGY INC | 53061041500 |
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 112634 entries, 0 to 112633 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 VIN (1-10) 112634 non-null object 1 County 112634 non-null object 2 City 112634 non-null object 3 State 112634 non-null object 4 Postal Code 112634 non-null int64 5 Model Year 112634 non-null int64 6 Make 112634 non-null object 7 Model 112614 non-null object 8 Electric Vehicle Type 112634 non-null object 9 Clean Alternative Fuel Vehicle (CAFV) Eligibility 112634 non-null object 10 Electric Range 112634 non-null int64 11 Base MSRP 112634 non-null int64 12 Legislative District 112348 non-null float64 13 DOL Vehicle ID 112634 non-null int64 14 Vehicle Location 112610 non-null object 15 Electric Utility 112191 non-null object 16 2020 Census Tract 112634 non-null int64 dtypes: float64(1), int64(6), object(10) memory usage: 14.6+ MB
In [ ]:
df.describe()
Out[ ]:
| Postal Code | Model Year | Electric Range | Base MSRP | Legislative District | DOL Vehicle ID | 2020 Census Tract | |
|---|---|---|---|---|---|---|---|
| count | 112634.000000 | 112634.000000 | 112634.000000 | 112634.000000 | 112348.000000 | 1.126340e+05 | 1.126340e+05 |
| mean | 98156.226850 | 2019.003365 | 87.812987 | 1793.439681 | 29.805604 | 1.994567e+08 | 5.296650e+10 |
| std | 2648.733064 | 2.892364 | 102.334216 | 10783.753486 | 14.700545 | 9.398427e+07 | 1.699104e+09 |
| min | 1730.000000 | 1997.000000 | 0.000000 | 0.000000 | 1.000000 | 4.777000e+03 | 1.101001e+09 |
| 25% | 98052.000000 | 2017.000000 | 0.000000 | 0.000000 | 18.000000 | 1.484142e+08 | 5.303301e+10 |
| 50% | 98119.000000 | 2020.000000 | 32.000000 | 0.000000 | 34.000000 | 1.923896e+08 | 5.303303e+10 |
| 75% | 98370.000000 | 2022.000000 | 208.000000 | 0.000000 | 43.000000 | 2.191899e+08 | 5.305307e+10 |
| max | 99701.000000 | 2023.000000 | 337.000000 | 845000.000000 | 49.000000 | 4.792548e+08 | 5.603300e+10 |
In [ ]:
df.isnull().sum()
Out[ ]:
VIN (1-10) 0 County 0 City 0 State 0 Postal Code 0 Model Year 0 Make 0 Model 20 Electric Vehicle Type 0 Clean Alternative Fuel Vehicle (CAFV) Eligibility 0 Electric Range 0 Base MSRP 0 Legislative District 286 DOL Vehicle ID 0 Vehicle Location 24 Electric Utility 443 2020 Census Tract 0 dtype: int64
In [3]:
Data = df.dropna()
In [4]:
Data.duplicated().sum()
Out[4]:
0
In [ ]:
In [ ]:
Task 1:Apply Exploratory Data Analysis (Univariate and Bivariate) on the dataset available above.
Exploratory Data Analysis (Electric Vehicles)
Univariate Analysis
In [ ]:
# Distribution of Electronic Vehicle Types
sns.countplot(data = Data, y = "Electric Vehicle Type")
plt.show()
In [ ]:
# Top 5 Electronic Vechile Brands
Top_5 = Data["Make"].value_counts().head(5)
sns.barplot(x = Top_5.values, y = Top_5.index)
plt.show()
In [ ]:
# Distribution of Model Year
sns.histplot(Data["Model Year"], bins = 20, kde = True)
plt.show
Out[ ]:
<function matplotlib.pyplot.show(close=None, block=None)>
In [ ]:
Bivariate Analysis
In [ ]:
sns.scatterplot(data = Data, x = "Electric Range" , y = "Base MSRP" , hue = "Electric Vehicle Type")
plt.show()
In [ ]:
plt.figure(figsize = (12, 6))
sns.boxplot(data = Data, x = "Make", y = "Electric Range")
plt.xticks(rotation=90)
plt.show()
In [ ]:
Task 2: Create a Choropleth using plotly.express to display the number of EV vehicles based on location.
In [6]:
import plotly.express as px
In [7]:
fig = px.choropleth(df, locations='State',
locationmode='USA-states',
color='Make',
hover_name='Make',
scope='usa',
title='Distribution of EVs by State')
fig.show()
In [8]:
fig = px.choropleth(df, locations='State',
locationmode='USA-states',
color='Make',
animation_frame='Model Year',
scope='usa',
title='Year-wise Distribution of EVs by State')
fig.show()
In [9]:
Cars_year = Data.groupby(['Make', 'Model Year']).size().reset_index(name='vehicle_count')
Cars_year = Cars_year.sort_values(by='Model Year')
Cars_year = Cars_year[(Cars_year['Model Year'] >= 2016) & (Cars_year['Model Year'] <= 2023)]
fig = px.bar(
Cars_year,
x='vehicle_count',
y='Make',
color='Make',
animation_frame='Model Year',
orientation='h',
title='EV Makes Count Over the Years',
labels={'vehicle_count': 'Number of Vehicles'},
range_x=[0, Cars_year['vehicle_count'].max() + 50]
)
fig.update_layout(
xaxis=dict(showgrid=True, gridcolor='LightGray', tickformat=',d'),
yaxis_title='Make',
xaxis_title='Number of Vehicles',
showlegend=False,
title_x=0.5,
margin=dict(l=50, r=50, t=50, b=50),
width=800,
height=600
)
fig.show()
In [ ]:
import bar_chart_race as bcr
In [ ]:
ev_counts = Data.groupby(['Model Year', 'Make']).size().unstack(fill_value=0)
bcr.bar_chart_race(
df=ev_counts,
filename="ev_racing_bar.mp4",
orientation="h",
sort="desc",
n_bars=10,
steps_per_period=50,
period_length=2000,
title={'label': 'Year-wise Sales of EV Brands', 'size': 32},
bar_label_font={'size': 18},
tick_label_font={'size': 14},
fig_kwargs={'figsize': (16, 9), 'dpi': 100},
)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: